# Import required packages
import json
import os
import re

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tweepy
# Load the WeRateDogs twitter posts archive from the local CSV file.
archive = pd.read_csv('twitter-archive-enhanced.csv')

# Load the image predictions dataset: download the TSV from Udacity's CDN,
# cache it to disk, then read it into a DataFrame.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
response.raise_for_status()  # fail fast on a bad download instead of writing an error page to disk
with open('image_predictions.tsv', 'wb') as f:
    f.write(response.content)
images = pd.read_csv('image_predictions.tsv', sep='\t')
# Query twitter API
# NOTE(review): the five blocks below are deliberately neutralised by being
# wrapped in triple-quoted string literals — they require live Twitter API
# credentials and a long-running download, and are kept only so the pipeline
# is reproducible. The notebook indentation inside the strings was lost when
# the file was flattened; re-indent before re-enabling any of them.
'''consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)'''
# Fetch every tweet in the archive by ID, remembering the IDs that fail.
'''list_of_tweets = []
# Tweets that can't be found are saved in the list below:
cant_find_tweets_for_those_ids = []
for tweet_id in archive['tweet_id']:
try:
list_of_tweets.append(api.get_status(tweet_id))
except Exception as e:
cant_find_tweets_for_those_ids.append(tweet_id)'''
# Convert the tweepy Status objects to plain dicts.
'''my_list_of_dicts = []
for each_json_tweet in list_of_tweets:
my_list_of_dicts.append(each_json_tweet._json)'''
# Cache the raw JSON to disk so the API only has to be queried once.
'''with open('tweet_json.txt', 'w') as file:
file.write(json.dumps(my_list_of_dicts, indent=4))'''
# Rebuild the tweet_json DataFrame (id, favorite/retweet counts, created_at,
# source device, url, retweet flag) from the cached file. The code below
# (e.g. tweet_json.head()) depends on this block having been run at least
# once with credentials — otherwise tweet_json is undefined.
'''my_list = []
with open('tweet_json.txt', encoding='utf-8') as json_file:
all_data = json.load(json_file)
for each_dictionary in all_data:
tweet_id = each_dictionary['id']
whole_tweet = each_dictionary['text']
only_url = whole_tweet[whole_tweet.find('https'):]
favorite_count = each_dictionary['favorite_count']
retweet_count = each_dictionary['retweet_count']
created_at = each_dictionary['created_at']
whole_source = each_dictionary['source']
only_device = whole_source[whole_source.find('rel="nofollow">') + 15:-4]
source = only_device
retweeted_status = each_dictionary['retweeted_status'] = each_dictionary.get('retweeted_status', 'Original tweet')
if retweeted_status == 'Original tweet':
url = only_url
else:
retweeted_status = 'This is a retweet'
url = 'This is a retweet'
my_list.append({'tweet_id': str(tweet_id),
'favorite_count': int(favorite_count),
'retweet_count': int(retweet_count),
'url': url,
'created_at': created_at,
'source': source,
'retweeted_status': retweeted_status,
})
tweet_json = pd.DataFrame(my_list, columns = ['tweet_id', 'favorite_count',
'retweet_count', 'created_at',
'source', 'retweeted_status', 'url'])'''
# ---- Assessment: visual and programmatic inspection of the three tables ----
# NOTE(review): tweet_json is built by the (currently disabled) API-query
# code above; it must exist in the session before these cells run.
tweet_json.head()
archive.info()
archive.sample(5)
#Check for null values
archive.isnull().sum()
#Value counts for the source column
archive.source.value_counts()
#check for duplicate IDs
archive[archive.tweet_id.duplicated()]
#Check for possible issue in denominator not equal to 10
archive.rating_denominator.value_counts()
# Most frequent extracted names — reveals non-names such as 'a'
archive.name.value_counts().head(20)
#Further explore the denominators
pd.set_option('display.max_colwidth', 1000)
denominator = archive.loc[archive['rating_denominator'] != 10, ['tweet_id','text','rating_numerator','rating_denominator']]
len(denominator)
#Check for possible issue in numerator not equal to 10-14
archive.rating_numerator.value_counts()
#Further explore numerators
numerator=archive.loc[archive['rating_numerator'] >20, ['tweet_id','text','rating_numerator','rating_denominator']]
numerator
len(numerator)
# Tweets whose rating has a decimal numerator (e.g. 13.5/10) — the archive's
# integer numerator column presumably mis-extracted these; worth confirming.
archive[archive.text.str.contains(r"(\d+\.\d*\/\d+)")][['text', 'rating_numerator']]
images.info()
images.sample(10)
#Check for any format/quality issues
images.p1_dog.value_counts()
images.p2_dog.value_counts()
images.p3_dog.value_counts()
images.describe()
# Checking for duplicated columns
all_columns = pd.Series(list(archive) + list(images) + list(tweet_json))
all_columns[all_columns.duplicated()]
tweet_json.info()
tweet_json.sample(10)
#Check for duplicate values
sum(tweet_json.duplicated())
#Check for null values
tweet_json.isnull().sum()
Twitter archive¶
- Timestamp column should be in time format and not as object.
- We only need user ratings and not retweets and the data set has some unnecessary columns.
- Source column data is unreadable HTML.
- The name column has a lot of non-name values. The most popular name is 'a' which is not a name at all.
Twitter Images¶
- The jpg_url, p1_conf, p2_conf and p3_conf column headers aren't properly descriptive names.
- p1, p2, and p3 values aren’t consistently named.
- Breed types for p1, p2, and p3 should be in category format and Tweet ID’s should be strings and not Integers.
- Denominator and numerator ratings have some incorrect values.
Twitter Json¶
- Delete unnecessary columns.
- The doggo, floofer, pupper and puppo columns should be combined into a single categorical column.
- Merge all cleaned datasets for visualization
# Work on copies so the raw, as-loaded datasets stay untouched.
archive_clean = archive.copy()
images_clean = images.copy()
tweets_clean = tweet_json.copy()

# Parse the timestamp strings into proper datetime values.
archive_clean['timestamp'] = pd.to_datetime(archive_clean['timestamp'])
archive_clean.info()

# Keep only original tweets: rows that are neither replies nor retweets.
is_original = archive_clean['in_reply_to_status_id'].isna() & archive_clean['retweeted_status_id'].isna()
archive_clean = archive_clean[is_original]

# The reply/retweet bookkeeping columns are no longer needed.
reply_retweet_cols = ['in_reply_to_status_id', 'in_reply_to_user_id',
                      'retweeted_status_id', 'retweeted_status_user_id',
                      'retweeted_status_timestamp']
archive_clean = archive_clean.drop(reply_retweet_cols, axis=1)
archive_clean.info()
# Map the raw HTML anchor tags in `source` to short, readable device names,
# then store the small closed set of values as a category.
source_labels = {
    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 'Twitter for iPhone',
    '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>': 'Vine Make a Scene',
    '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 'Twitter Web Client',
    '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>': 'Tweet Deck',
}
archive_clean.source = archive_clean.source.replace(source_labels)
archive_clean.source = archive_clean.source.astype('category')
archive_clean.source.value_counts()
# The archive's `name` column is unreliable (lots of 'a'/'an'/'the'), so
# re-extract dog names from the tweet text itself.
archive_clean['text_split'] = archive_clean['text'].str.split()

def extract_name(row):
    """Return the dog's name parsed from a tweet row, or 'Nameless'.

    Handles the account's standard phrasings ('This is X', 'Meet X',
    'Say hello to X', 'Here we have X', '... named X'). A candidate word
    only counts as a name when it starts with a capital letter.
    """
    text = row['text']
    words = row['text_split']

    def word_at(idx):
        # Guard against short texts: an out-of-range position yields '',
        # which can never pass the capital-letter check below.
        return words[idx] if 0 <= idx < len(words) else ''

    # 'This is Charlie'
    if text.startswith('This is ') and re.match(r'[A-Z].*', word_at(2)):
        return word_at(2).strip('.').strip(',')
    # 'Meet Charlie'
    if text.startswith('Meet ') and re.match(r'[A-Z].*', word_at(1)):
        return word_at(1).strip('.').strip(',')
    # 'Say hello to Charlie'
    if text.startswith('Say hello to ') and re.match(r'[A-Z].*', word_at(3)):
        return word_at(3).strip('.').strip(',')
    # 'Here we have Charlie'
    if text.startswith('Here we have ') and re.match(r'[A-Z].*', word_at(3)):
        return word_at(3).strip('.').strip(',')
    # '... named Charlie' — require 'named' to be a standalone token so that
    # list.index cannot raise ValueError (the original substring test could).
    if 'named' in words and re.match(r'[A-Z].*', word_at(words.index('named') + 1)):
        return word_at(words.index('named') + 1)
    return 'Nameless'

# Build the column directly from apply() instead of appending to a
# module-level list inside the callback (fragile: re-running the cell
# would double the list and break the column assignment).
archive_clean['names'] = archive_clean.apply(extract_name, axis=1)
len(archive_clean['names'])
archive_clean['names'].value_counts()
# Give the image-prediction columns descriptive names.
column_names = {
    'jpg_url': 'image_url',
    'img_num': 'images_number',
    'p1': 'prediction_1', 'p1_conf': 'confidence_1', 'p1_dog': 'dog_1',
    'p2': 'prediction_2', 'p2_conf': 'confidence_2', 'p2_dog': 'dog_2',
    'p3': 'prediction_3', 'p3_conf': 'confidence_3', 'p3_dog': 'dog_3',
}
images_clean = images_clean.rename(columns=column_names)
images_clean.head(5)

# Normalise breed names to lower case for consistency.
prediction_cols = ['prediction_1', 'prediction_2', 'prediction_3']
for col in prediction_cols:
    images_clean[col] = images_clean[col].str.lower()
images_clean[prediction_cols]

# Breed names are a closed set — store them as categories.
for col in prediction_cols:
    images_clean[col] = images_clean[col].astype('category')
images_clean.info()

# Tweet IDs are identifiers, not quantities — keep them as strings in every table.
archive_clean = archive_clean.astype({'tweet_id': str})
images_clean = images_clean.astype({'tweet_id': str})
tweets_clean = tweets_clean.astype({'tweet_id': str})
archive_clean.info()
images_clean.info()
tweets_clean.info()
# Inspect ratings whose denominator is not the standard 10.
denom = archive_clean[archive_clean['rating_denominator'] != 10]
denom[['tweet_id','text','rating_numerator','rating_denominator']]

# Manually correct four ratings that were extracted incorrectly from the
# tweet text.
# BUG FIX: tweet_id was cast to str above, so the lookup keys must be
# strings — the original compared against int literals, which matched no
# rows and silently left the bad ratings in place.
corrections = {
    '716439118184652801': (11, 10),
    '666287406224695296': (9, 10),
    '682962037429899265': (10, 10),
    '740373189193256964': (14, 10),
}
for tid, (num, den) in corrections.items():
    archive_clean.loc[archive_clean.tweet_id == tid, 'rating_numerator'] = num
    archive_clean.loc[archive_clean.tweet_id == tid, 'rating_denominator'] = den

# Confirm the corrections took effect.
ids = list(corrections)
archive_clean.loc[archive_clean['tweet_id'].isin(ids), ['tweet_id','rating_numerator', 'rating_denominator']]
# Keep only the columns needed for the analysis (id plus the two counts).
unneeded = ['created_at', 'source', 'retweeted_status', 'url']
tweets_clean = tweets_clean.drop(unneeded, axis=1)
tweets_clean.info()
def concat_stages(row):
    """Combine the four dog-stage indicator columns into one label.

    Each of doggo/floofer/pupper/puppo holds either its own name or the
    string 'None'. Dogs with several stages get a space-joined, sorted
    label (e.g. 'doggo pupper').

    BUG FIX: dogs with no stage are now labelled 'unknown' instead of ''
    — the visualization code filters on dog_stage != 'unknown', which
    previously matched nothing because the no-stage label was ''.
    """
    all_stages = ['doggo', 'floofer', 'pupper', 'puppo']
    stages = [stage for stage in all_stages if row[stage] != 'None']
    return ' '.join(sorted(stages)) if stages else 'unknown'
# Build a single dog_stage column from the four indicator columns,
# then drop the originals.
archive_clean['dog_stage'] = archive_clean[['doggo', 'floofer', 'pupper', 'puppo']].apply(
    concat_stages, axis=1
)
archive_clean = archive_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)

# Collapse multi-stage labels into the more specific stage BEFORE casting
# to category — the original replaced values on an already-categorical
# column, which can fail in pandas and leaves the combined labels behind
# as stale, unused categories.
archive_clean.dog_stage = archive_clean.dog_stage.replace('doggo pupper', 'pupper')
archive_clean.dog_stage = archive_clean.dog_stage.replace('doggo puppo', 'puppo')
archive_clean.dog_stage = archive_clean.dog_stage.replace('doggo floofer', 'floofer')
archive_clean.dog_stage = archive_clean.dog_stage.astype('category')
archive_clean.info()
archive_clean.dog_stage.value_counts()
# Merge the three cleaned tables into one master dataset, all keyed on
# tweet_id: image predictions <- API counts <- archive.
merged = images_clean.merge(tweets_clean, how='left', on='tweet_id')
archive_clean = merged.merge(archive_clean, how='left', on='tweet_id')
archive_clean.info()
archive_clean.info()
archive_clean.to_csv('twitter_archive_master.csv', index=False, encoding='utf-8')
# Bar chart of the ten breeds with the highest mean first-prediction confidence.
plt.figure(figsize = [10, 5])
sns.set_style("whitegrid", {'axes.grid' : False})
# BUG FIX: the original sliced `[9::-1]`, which takes the FIRST ten breeds
# in (reversed) groupby-key order, not the top ten by confidence that the
# title promises. nlargest(10) selects by the mean confidence itself.
top_dogs = archive_clean.groupby('prediction_1').confidence_1.mean().nlargest(10)
top_dogs.plot(kind = 'bar', fontsize = 12)
plt.title('Top Ten Dog Breeds', fontsize = 14)
plt.xlabel('Dog Breed', fontsize = 14)
plt.ylabel('Prediction', fontsize = 14);
# Most favored dog stage, measured by average like (favorite) count.
known_stages = archive_clean[archive_clean['dog_stage'] != 'unknown']
alt.Chart(known_stages).mark_bar(color='blue', opacity=0.5).encode(
    alt.Y('dog_stage', axis=alt.Axis(title='Dog Stage')),
    alt.X('average(favorite_count)', axis=alt.Axis(title='Average Like Count')),
).properties(title = 'Most Favored Dog Stage as per Likes Count')
Puppo dogs are the most liked dog stage, as measured by like counts.
# Same comparison as above, but measured by average retweet count.
known_stages_rt = archive_clean[archive_clean['dog_stage'] != 'unknown']
alt.Chart(known_stages_rt).mark_bar(color = 'green', opacity = 0.5).encode(
    alt.Y('dog_stage', axis = alt.Axis(title = 'Dog Stage')),
    alt.X('average(retweet_count)', axis = alt.Axis(title = 'Average Retweet Count')),
).properties(title = 'Most Favored Dog Stage as per Retweet Count')
Puppo dogs are also the most popular dog stage, as measured by retweet count.
# Correlation between rating and favorite count.
# Extreme outliers are removed first so the trend is visible.
rating_favorite = archive_clean.copy()
rating_favorite['rating'] = (rating_favorite['rating_numerator']
                             / rating_favorite['rating_denominator'])
in_range = (rating_favorite['rating'] <= 2) & (rating_favorite['favorite_count'] <= 130000)
rating_favorite = rating_favorite[in_range]

plt.figure(figsize = [10, 5])
reg_ax = sns.regplot(x = 'rating', y = 'favorite_count', data = rating_favorite)
reg_ax.set(ylim = (0, None))
plt.title('The Correlation between Rating and Favorite Count')
plt.xlabel('Rating')
plt.ylabel('Favorite Count')
reg_ax.get_figure().savefig('ratingplot.png')
Seems like there is a positive correlation between ratings and favorite count.